
# Replication code for Greenhill, Ward and Sacks (2011) "The Separation Plot: A New Visual Method for Evaluating the Fit of Binary Models", American Journal of Political Science 55 (3): x-xx.

# Compiled by Brian Greenhill, June 16, 2011.

# Notes:
# The functions to generate separationplots can be found in the library called "separationplot" on CRAN.  Replicating the output in the paper also depends upon downloading various other replication data sets, as referenced below.


# begin by loading the separationplot library:
library(separationplot)


### Figure 1:

library(ROCR)
pred.object<-prediction(c(0.774, 0.364, 0.997, 0.728, 0.961, 0.422), c(0,0,1,0,1,1))
perf.object<-performance(pred.object,"tpr","fpr")
par(las=1)
plot(perf.object, lwd=2)
points(x=c(1, 2/3, 2/3, 1/3, 0, 0, 0), y=c(1, 1, 2/3, 2/3, 2/3, 1/3, 0), cex=1.2, pch=19)


### Figure 2:

separationplot(pred=c(0.774, 0.364, 0.997, 0.728, 0.961, 0.422), actual=c(0,0,1,0,1,1), shuffle=F, line=F, type="rect", rectborder=1)


### Figure 3:

# Make a larger hypothetical plot:

library(MASS)
set.seed(1)
Sigma <- matrix(c(1,0.78,0.78,1),2,2)
a<-(mvrnorm(n=500, rep(0, 2), Sigma))
a[,2][a[,2]>0.75]<-1
a[,2][a[,2]<=0.75]<-0
a[,1]<-a[,1]-min(a[,1])
a[,1]<-a[,1]/max(a[,1])

cor(a) # should be 0.55

model1<-glm(a[,2]~a[,1], family=binomial(link = "logit"))

library(Hmisc)
somers2(model1$fitted.values, model1$y)

separationplot(pred=model1$fitted.values, actual=model1$y, type="rect", line=F, show.expected=F)


### Figure 4:

separationplot(pred=model1$fitted.values, actual=model1$y, type="rect", line=T, show.expected=F)


### Figure 5:

separationplot(pred=model1$fitted.values, actual=model1$y, type="rect", line=T, show.expected=T)

sum(model1$fitted.values) # should be 118


### Figure 6:

# show a "perfect" plot:
e<-plogis(seq(-10, 3.02, length.out=500))
sum(e) # should be 118
f<-c(rep(0, 382), rep(1, 118))
separationplot(pred=e, actual=f, type="rect", show.expected=T)


### Figure 7:

# Figure 7 uses data from the ICEWS project that is not publicly available.


### Figure 8:

# Note that Figure 8 uses replication data and R code obtained from the replication data of Hillygus and Jackman (2003), available at http://www.people.fas.harvard.edu/~hillygus/research.html

library(MASS)
library (nnet)

setwd("/Users/brian/Brian/Dichotomous Outcomes/data/Hillygus and Jackman AJPS 2003")
conv <- dget(file="conv.dpt")
debate <- dget(file="debate.dpt") 

logitconv <- glm(vote ~ -1 + prevote
                 + (age + I(age^2)+female + interest2*(democ + repub) + unfavorcln+apprv):prevote,
                 data=conv,
                 weights=weightn,
                 subset=edat.post<241,   ## NOTE NEW CUTOFF DATE
                 na.action=na.omit,
                 epsilon=1e-14,maxit=1000,trace=T,
                 family=binomial(link=logit),
                 x=T)

summary(logitconv, correlation=F)

logitdeb <- glm(vote ~ -1 + prevote + (age + I(age^2)+female + interest2*(democ + repub) + 
unfavorcln+apprv):prevote, data=debate,weights=debate$weightn, 
subset=((edat.pre>240&edat.post<303) & srvy2510==0),na.action=na.omit, 
epsilon=1e-14,maxit=1000,trace=T, family=binomial(link=logit),x=T)

summary(logitdeb, correlation=F)

separationplot(pred=logitconv$fitted.values, actual=logitconv$y, lwd1=0.25, heading="Convention Model", type="line", show.expected=T)

separationplot(pred=logitdeb$fitted.values, actual=logitdeb$y, lwd1=0.25, heading="Debate Model", type="line", show.expected=T)

# Do the other statistics:
library(Hmisc)
somers2(logitconv$fitted.values, logitconv$y)
somers2(logitdeb$fitted.values, logitdeb$y)

brier<-function(phat, y) (sum((phat-y)^2))/length(phat)

ePCP<-function(phat, y) (sum(phat[y==1]) + sum(1-phat[y==0]))/length(phat)

brier(logitconv$fitted.values, logitconv$y)
brier(logitdeb$fitted.values, logitdeb$y)

ePCP(logitconv$fitted.values, logitconv$y)
ePCP(logitdeb$fitted.values, logitdeb$y)


### Figure 9

# Note that Figure 9 depends upon the replication data for Fearon & Laitin (2003) that is available at: http://www.stanford.edu/group/ethnic/publicdata/publicdata.html

library(foreign)
fl<-read.dta("/Users/brian/Brian/Dichotomous Outcomes/data/Fearon and Laitin APSR 2003/repdata.dta")

fl$onset[fl$onset>1]<-1
model1<-glm(onset ~ warl + gdpenl + lpopl1 + lmtnest + ncontig + Oil + nwstate + instab + polity2l + ethfrac + relfrac, data=fl, family=binomial(link = "logit"))

separationplot(pred=model1$fitted.values, actual=model1$y, lwd1=0.25, heading="Fearon & Laitin Model 1", show.expected=T)

# Now estimate the same model using only gdp:
model2<-glm(onset ~ gdpenl, data=fl, family=binomial(link = "logit"))
separationplot(pred=model2$fitted.values, actual=model2$y, lwd1=0.25, heading="GDP only", show.expected=T)

somers2(model1$fitted.values, model1$y)
somers2(model2$fitted.values, model2$y)

brier<-function(phat, y) (sum((phat-y)^2))/length(phat)

ePCP<-function(phat, y) (sum(phat[y==1]) + sum(1-phat[y==0]))/length(phat)

brier(model1$fitted.values, model1$y)
brier(model2$fitted.values, model2$y)

ePCP(model1$fitted.values, model1$y)
ePCP(model2$fitted.values, model2$y)



### Figure 10:

separationplot(pred=model1$fitted.values, actual=model1$y, lwd1=0.25, heading="Equal widths", type="rect", show.expected=T)

separationplot(pred=model1$fitted.values, actual=model1$y, lwd1=0.25, heading="Highlighting of events", show.expected=T)

separationplot(pred=model1$fitted.values, actual=model1$y, lwd1=0.25, heading="Highlighting of non-events", type="line", zerosfirst=F, show.expected=T)


### Figure 11:

# Download the data from http://gking.harvard.edu/data?dvn_subpage=/faces/study/StudyPage.xhtml?globalId=hdl:1902.1/QTCABXZZRQ

data<-x
model<-glm(turnout~educate+age+income+agesqrd+white, family=binomial(link =
"logit"), data=data)
pred<-model$fitted.values
actuals<-model1$y
separationplot(data$phats, data$Actuals, show.expected=T)


### Figure 12:

separationplot(data$phats, data$Actuals, type="bands")


### Figure 13:

# Download the replication data from http://www2.lse.ac.uk/dataFiles/geographyAndEnvironment/Replication/Article%20for%20JCR%20(Human%20Rights).dta

neumayer<-read.dta("neumayer.dta")

# create a new dataframe called "data4" that just contains the variables we're interested in (and with simpler names).
data6<-na.omit(data.frame(DV=neumayer$aipts, laggedDV=neumayer$laipts, rat=neumayer$iccprmainrat, ingo.pc=neumayer$wiikngointerpc, dem=neumayer$politycorr020, extwar=neumayer$uppsalaexternalincountry, civwar=neumayer$uppsalainternal, gdp=neumayer$lngdp1995pc, pop=neumayer$lnpop, country=neumayer$country, year=neumayer$year))

model6<-polr(as.ordered(DV)~laggedDV +rat + rat:ingo.pc + rat:dem +ingo.pc  +dem +extwar +civwar+gdp +pop, data=data6, Hess=T, method="probit")
summary(model6) 

sp.categorical(pred=model6$fitted.values, actual=as.character(model6$model[,1]), cex=2.5)